From Command Line - Import CSV file (From Data Preparation Part 2) into MongoDB

mongoimport --db airbnb --type csv --file preprocessed_data_1_Apr.csv --headerline -c Listing



In [30]:

    
from pymongo import MongoClient



In [39]:

    
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import numpy as np









    



/Applications/anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [31]:

    
import pandas as pd

Connect Python to MongoDB



In [32]:

    
client = MongoClient()



In [33]:

    
client = MongoClient('localhost', 27017)

Retrieve from Database

Database named as "airbnb"



In [34]:

    
db = client.airbnb



In [35]:

    
cursor = db.Listing.find()

Store data in pandas dataframe for analysis and preprocesssing



In [37]:

    
data = pd.DataFrame(list(cursor))



In [88]:

    
data.head()









    Out[88]:






  
    
      
      
      24-hour check-in
      Air conditioning
      Breakfast
      Cable TV
      Elevator in building
      Family/kid friendly
      Free parking on premises
      Free parking on street
      Gym
      ...
      review_scores_cleanliness
      review_scores_communication
      review_scores_location
      review_scores_rating
      review_scores_value
      reviews_per_month
      roomtype_Entire home/apt
      roomtype_Private room
      roomtype_Shared room
      security_deposit
    
  
  
    
      0
      0
      1
      0
      1
      0
      1
      0
      0
      0
      0
      ...
      10
      10
      10
      95
      10
      2.87
      0.0
      1.0
      0.0
      100
    
    
      1
      1
      0
      1
      1
      1
      1
      0
      0
      0
      0
      ...
      10
      10
      10
      95
      9
      3.12
      0.0
      1.0
      0.0
      200
    
    
      2
      2
      0
      0
      0
      0
      1
      1
      0
      0
      0
      ...
      5
      8
      9
      65
      7
      1.14
      0.0
      1.0
      0.0
      0
    
    
      3
      3
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      10
      9
      9
      92
      9
      1.42
      0.0
      1.0
      0.0
      100
    
    
      4
      4
      0
      0
      0
      0
      1
      0
      0
      0
      0
      ...
      8
      9
      9
      88
      9
      1.05
      0.0
      1.0
      0.0
      0
    
  

5 rows × 77 columns



In [91]:

    
df_dummies = data[['host_response_rate', 'host_is_superhost',
       'host_total_listings_count', 'host_has_profile_pic',
       'host_identity_verified', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people', 'minimum_nights', 'maximum_nights', 'availability_365',
       'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       'reviews_per_month', 'nearest_attr_dist', 'nearest_attr_rating',
       '24-hour check-in', 'Family/kid friendly', 'Heating', 'Pets allowed',
       'Internet', 'Smoking allowed', 'Suitable for events',
       'Free parking on premises', 'Pool', 'Private entrance',
       'Lock on bedroom door', 'Wheelchair accessible', 'TV',
       'Indoor fireplace', 'Private living room', 'Pets live on this property',
       'Elevator in building', 'Free parking on street',
       'Paid parking off premises', 'Other pet(s)', 'Gym', 'Air conditioning',
       'Kitchen', 'Cable TV', 'Breakfast', 'host_since_days',
       'neighbourhood_Ciutat Vella', 'neighbourhood_Eixample',
       'neighbourhood_Gràcia', 'neighbourhood_Horta-Guinardó',
       'neighbourhood_Les Corts', 'neighbourhood_Nou Barris',
       'neighbourhood_Sant Andreu', 'neighbourhood_Sant Martí',
       'neighbourhood_Sants-Montjuïc', 'neighbourhood_Sarrià-Sant Gervasi',
       'roomtype_Entire home/apt', 'roomtype_Private room',
       'roomtype_Shared room', 'cancellation_policy_flexible',
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60']]



In [92]:

    
dataframe = df_dummies.dropna()



In [93]:

    
df_dummies1 = dataframe[[
        'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people', 'minimum_nights', 'maximum_nights', 'availability_365',
       'instant_bookable','nearest_attr_dist', 'nearest_attr_rating',
       '24-hour check-in', 'Family/kid friendly', 'Heating', 'Pets allowed',
       'Internet', 'Smoking allowed', 'Suitable for events',
       'Free parking on premises', 'Pool', 'Private entrance',
       'Lock on bedroom door', 'Wheelchair accessible', 'TV',
       'Indoor fireplace', 'Private living room', 'Pets live on this property',
       'Elevator in building', 'Free parking on street',
       'Paid parking off premises', 'Other pet(s)', 'Gym', 'Air conditioning',
       'Kitchen', 'Cable TV', 'Breakfast', 
       'neighbourhood_Ciutat Vella', 'neighbourhood_Eixample',
       'neighbourhood_Gràcia', 'neighbourhood_Horta-Guinardó',
       'neighbourhood_Les Corts', 'neighbourhood_Nou Barris',
       'neighbourhood_Sant Andreu', 'neighbourhood_Sant Martí',
       'neighbourhood_Sants-Montjuïc', 'neighbourhood_Sarrià-Sant Gervasi',
       'roomtype_Entire home/apt', 'roomtype_Private room',
       'roomtype_Shared room', 'cancellation_policy_flexible',
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60']]
#'host_response_rate', 'host_is_superhost','number_of_reviews''host_since_days'

Linear Regression



In [95]:

    
y = dataframe["price"]
X = df_dummies1



In [96]:

    
lm = LinearRegression()

Split data into training and testing data + fit into Linear Regression Model



In [97]:

    
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
lm.fit(X_train, y_train)
pred_train = lm.predict(X_train)
pred_test = lm.predict(X_test)









    



/Applications/anaconda/lib/python3.5/site-packages/scipy/linalg/basic.py:1018: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  warnings.warn(mesg, RuntimeWarning)



In [98]:

    
print("MSE:",mean_squared_error(pred_test,y_test))
r=lm.score(X_train,y_train) ## or sklearn.metrics.r2_score(y_test,pred_test)
N = X_train.shape[0]
n_predictors = len(X_train.iloc[0])
r_square_adj = 1-((1-r)*(N-1)/(N-n_predictors-1))
print("r square:", r)
print("r square adj:", r_square_adj)









    



MSE: 3823.02339645
r square: 0.41162589796
r square adj: 0.408170935169

Linear Regression Results



In [99]:

    
X_train_constant = sm.add_constant(X_train)
est = sm.OLS(np.asarray(y_train), np.asarray(X_train_constant))
est2 = est.fit()
print(est2.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.412
Model:                            OLS   Adj. R-squared:                  0.408
Method:                 Least Squares   F-statistic:                     128.2
Date:                Sun, 09 Apr 2017   Prob (F-statistic):               0.00
Time:                        20:23:38   Log-Likelihood:                -54982.
No. Observations:                9765   AIC:                         1.101e+05
Df Residuals:                    9711   BIC:                         1.105e+05
Df Model:                          53                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             4.8785      0.805      6.059      0.000         3.300     6.457
x2            18.5676      1.366     13.589      0.000        15.889    21.246
x3            17.8359      1.268     14.070      0.000        15.351    20.321
x4            -0.9345      0.833     -1.121      0.262        -2.568     0.699
x5             0.0542      0.006      9.807      0.000         0.043     0.065
x6             0.2435      0.035      6.981      0.000         0.175     0.312
x7             2.4058      0.686      3.508      0.000         1.061     3.750
x8            -0.4537      0.069     -6.586      0.000        -0.589    -0.319
x9            -0.3481      0.112     -3.120      0.002        -0.567    -0.129
x10        -3.018e-08   2.27e-08     -1.332      0.183     -7.46e-08  1.42e-08
x11        -4.975e-05      0.006     -0.009      0.993        -0.011     0.011
x12           -0.9282      1.558     -0.596      0.551        -3.982     2.126
x13          -33.7995      5.187     -6.516      0.000       -43.967   -23.632
x14           -1.6189      1.159     -1.397      0.163        -3.891     0.653
x15            1.8765      1.605      1.169      0.242        -1.270     5.023
x16           -5.2066      1.673     -3.111      0.002        -8.487    -1.926
x17            1.0416      1.811      0.575      0.565        -2.508     4.591
x18            2.4265      2.021      1.201      0.230        -1.534     6.387
x19           -5.6857      1.508     -3.771      0.000        -8.641    -2.730
x20           -3.4942      1.590     -2.197      0.028        -6.611    -0.377
x21            4.4982      3.348      1.344      0.179        -2.064    11.061
x22           -2.4515      3.626     -0.676      0.499        -9.560     4.657
x23           42.4164      5.179      8.190      0.000        32.264    52.569
const       8.274e-12    4.6e-11      0.180      0.857     -8.19e-11  9.84e-11
x24            0.7599      2.946      0.258      0.796        -5.016     6.536
x25           -0.3734      2.266     -0.165      0.869        -4.815     4.068
x26           -1.6324      1.707     -0.956      0.339        -4.978     1.713
x27           14.4822      5.205      2.782      0.005         4.279    24.685
x28          -16.5992     33.931     -0.489      0.625       -83.111    49.913
x29            1.3879      2.639      0.526      0.599        -3.785     6.560
x30            5.2000      1.616      3.218      0.001         2.032     8.368
x31           25.5241     27.965      0.913      0.361       -29.294    80.342
x32          -26.2918     47.964     -0.548      0.584      -120.312    67.728
x33            1.3137     15.006      0.088      0.930       -28.102    30.729
x34           40.3075      7.864      5.126      0.000        24.892    55.723
x35            7.4940      1.742      4.303      0.000         4.080    10.908
x36           -0.1596      2.629     -0.061      0.952        -5.314     4.995
x37           13.1901      2.141      6.161      0.000         8.994    17.386
x38            1.2332      2.490      0.495      0.620        -3.647     6.113
x39            0.9130      2.669      0.342      0.732        -4.318     6.144
x40            0.7601      2.520      0.302      0.763        -4.181     5.701
x41           -7.3887      2.783     -2.655      0.008       -12.844    -1.934
x42           -2.8443      4.170     -0.682      0.495       -11.018     5.329
x43          -10.6387      5.152     -2.065      0.039       -20.737    -0.540
x44            5.8096      6.813      0.853      0.394        -7.544    19.164
x45          -15.3820      5.507     -2.793      0.005       -26.177    -4.587
x46            3.1419      2.880      1.091      0.275        -2.504     8.788
x47          -10.2549      2.793     -3.672      0.000       -15.730    -4.780
x48           -7.1872      4.021     -1.788      0.074       -15.068     0.694
x49           11.4153      6.300      1.812      0.070        -0.933    23.764
x50          -16.1267      6.106     -2.641      0.008       -28.096    -4.157
x51          -38.3598      8.302     -4.621      0.000       -54.633   -22.087
x52           19.2004      7.099      2.705      0.007         5.284    33.117
x53           15.4911      7.095      2.183      0.029         1.583    29.399
x54           17.7116      7.067      2.506      0.012         3.858    31.565
x55          -41.8015     25.003     -1.672      0.095       -90.812     7.209
x56          -53.6727     34.312     -1.564      0.118      -120.931    13.585
==============================================================================
Omnibus:                    22556.053   Durbin-Watson:                   2.061
Prob(Omnibus):                  0.000   Jarque-Bera (JB):        471550190.955
Skew:                          22.054   Prob(JB):                         0.00
Kurtosis:                    1078.644   Cond. No.                     9.02e+15
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.13e-13. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

Feature Selection



In [100]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import itertools
import time
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split



In [140]:

    
y = dataframe["price"]
X = df_dummies1

Define Forward and Backward Selection codes



In [141]:

    
def processSubset(feature_set):
# Fit model on feature_set and calculate RSS
    lm = LinearRegression()
    lm.fit(X[feature_set], y)
    #RSS = ((regr.predict(X[list(feature_set)]) - np.mean(y)) ** 2).sum()
    r = lm.score(X[feature_set], y)
    #adjr =regr.rsquared_adj
    return {"Predictors": feature_set, "model":lm, "Rsquared":r}



In [142]:

    
def getBest(k, colsToKeep = [], colsToDrop = []):
    tic = time.time()
    newX = X.drop((colsToDrop+colsToKeep), axis=1)
    results = []
    for combo in itertools.combinations(newX, k):
        results.append(processSubset(list(combo)+colsToKeep)) # Wrap everything up in a nice dataframe
    models = pd.DataFrame(results)
# Choose the model with the highest RSS
    best_model = models.loc[models["Rsquared"].argmax()]
    toc = time.time()
    #print(best_model["Predictors"])
    print(best_model["Rsquared"])
    #print(best_model["adjRsquared"])
    print("Processed ", models.shape[0], "models on", k+len(colsToKeep), "predictors in", (toc-tic), "seconds.") 
    # Return the best model, along with some other useful information about the model
    return best_model



In [143]:

    
def forward(criteria):
    tic = time.time()
    minchange = criteria
    predictors = []
    r = 0
    best_model = {}
    while len(predictors) < len(X.columns):
        model = getBest(1,predictors, [])
        modelr = model["Rsquared"]
        if (modelr-r<criteria):
            break
        best_model = model
        predictors = model["Predictors"]
        r = modelr
    toc = time.time()
    print("Forward Selction: ", (toc-tic), "seconds")
    return best_model



In [144]:

    
def backward(criteria):
# Pull out predictors we still need to process
    tic = time.time()
    minchange = criteria
    lm = LinearRegression()
    lm.fit(X, y)
    r = lm.score(X, y)
    predictors = X.columns
    colsToDrop = []
    best_model = {"Predictors": X.columns, "model":lm, "Rsquared":r}
    while len(predictors) > 0:
        model = getBest((len(predictors)-1),[],colsToDrop)
        modelr = model["Rsquared"]
        if (r-modelr>criteria):
            break
        best_model = model
        colsToDrop.extend(list(set(predictors) - set(model["Predictors"])))
        predictors = model["Predictors"]
        r = modelr
    toc = time.time()
    print("Backward Selction: ", (toc-tic), "seconds")
    return best_model



In [145]:

    
model0 = forward(0.00055)









    



0.298934660203
Processed  57 models on 1 predictors in 0.146956205368042 seconds.
0.33832403961
Processed  56 models on 2 predictors in 0.1799790859222412 seconds.
0.358646174217
Processed  55 models on 3 predictors in 0.18277406692504883 seconds.
0.371316610034
Processed  54 models on 4 predictors in 0.1966390609741211 seconds.
0.380066366233
Processed  53 models on 5 predictors in 0.2458181381225586 seconds.
0.395376821169
Processed  52 models on 6 predictors in 0.2595219612121582 seconds.
0.400575434741
Processed  51 models on 7 predictors in 0.264024019241333 seconds.
0.405839162494
Processed  50 models on 8 predictors in 0.3078019618988037 seconds.
0.408679716684
Processed  49 models on 9 predictors in 0.34799695014953613 seconds.
0.410272181177
Processed  48 models on 10 predictors in 0.32643890380859375 seconds.
0.411705935802
Processed  47 models on 11 predictors in 0.3740389347076416 seconds.
0.412816558812
Processed  46 models on 12 predictors in 0.4386940002441406 seconds.
0.414013934771
Processed  45 models on 13 predictors in 0.4263601303100586 seconds.
0.41514360785
Processed  44 models on 14 predictors in 0.43903207778930664 seconds.
0.415856357941
Processed  43 models on 15 predictors in 0.4996500015258789 seconds.
0.416520946845
Processed  42 models on 16 predictors in 0.5092720985412598 seconds.
0.417173570144
Processed  41 models on 17 predictors in 0.47203493118286133 seconds.
0.417817715093
Processed  40 models on 18 predictors in 0.5668189525604248 seconds.
0.418496624203
Processed  39 models on 19 predictors in 0.5907201766967773 seconds.
0.419069563765
Processed  38 models on 20 predictors in 0.5899569988250732 seconds.
0.419549358552
Processed  37 models on 21 predictors in 0.6119301319122314 seconds.
Forward Selction:  7.996230840682983 seconds

Forward Selection features selected



In [146]:

    
model0["Predictors"]









    Out[146]:





['neighbourhood_Sant Andreu',
 'Elevator in building',
 'Internet',
 'Indoor fireplace',
 'Family/kid friendly',
 'neighbourhood_Gràcia',
 'Gym',
 'guests_included',
 'extra_people',
 'Air conditioning',
 'neighbourhood_Sants-Montjuïc',
 'Cable TV',
 'Pool',
 'nearest_attr_dist',
 'roomtype_Entire home/apt',
 'bathrooms',
 'security_deposit',
 'bedrooms',
 'cleaning_fee',
 'accommodates']



In [147]:

    
model1 = backward(0.00055)









    



0.422691201857
Processed  57 models on 56 predictors in 3.6771469116210938 seconds.
0.422691201857
Processed  56 models on 55 predictors in 3.4409780502319336 seconds.
0.422691201857
Processed  55 models on 54 predictors in 3.111250877380371 seconds.
0.422691201857
Processed  54 models on 53 predictors in 3.030025005340576 seconds.
0.422691200936
Processed  53 models on 52 predictors in 3.3295059204101562 seconds.
0.42269049434
Processed  52 models on 51 predictors in 2.8878989219665527 seconds.
0.422689406775
Processed  51 models on 50 predictors in 2.6834959983825684 seconds.
0.422684802128
Processed  50 models on 49 predictors in 2.680804967880249 seconds.
0.422679037124
Processed  49 models on 48 predictors in 2.6422650814056396 seconds.
0.422671576731
Processed  48 models on 47 predictors in 2.556030035018921 seconds.
0.422660299491
Processed  47 models on 46 predictors in 2.3463919162750244 seconds.
0.42264822226
Processed  46 models on 45 predictors in 2.4494080543518066 seconds.
0.422626156596
Processed  45 models on 44 predictors in 2.144435167312622 seconds.
0.422603353623
Processed  44 models on 43 predictors in 2.0105979442596436 seconds.
0.422579347661
Processed  43 models on 42 predictors in 1.9088590145111084 seconds.
0.422554162703
Processed  42 models on 41 predictors in 1.731584072113037 seconds.
0.422526379452
Processed  41 models on 40 predictors in 1.6532518863677979 seconds.
0.422496708172
Processed  40 models on 39 predictors in 1.57368803024292 seconds.
0.422464661268
Processed  39 models on 38 predictors in 1.527198076248169 seconds.
0.422429757161
Processed  38 models on 37 predictors in 1.3817448616027832 seconds.
0.422387802721
Processed  37 models on 36 predictors in 1.286344051361084 seconds.
0.42234415406
Processed  36 models on 35 predictors in 1.246183156967163 seconds.
0.422282363492
Processed  35 models on 34 predictors in 1.0897390842437744 seconds.
0.422205910762
Processed  34 models on 33 predictors in 1.0299420356750488 seconds.
0.422104770547
Processed  33 models on 32 predictors in 0.8592820167541504 seconds.
0.421981417608
Processed  32 models on 31 predictors in 0.7988929748535156 seconds.
0.421850987097
Processed  31 models on 30 predictors in 0.7704031467437744 seconds.
0.42168895409
Processed  30 models on 29 predictors in 0.7044811248779297 seconds.
0.421524074075
Processed  29 models on 28 predictors in 0.6968028545379639 seconds.
0.421388460638
Processed  28 models on 27 predictors in 0.6877419948577881 seconds.
0.421205556652
Processed  27 models on 26 predictors in 0.5735280513763428 seconds.
0.420952550107
Processed  26 models on 25 predictors in 0.5913848876953125 seconds.
0.420689650809
Processed  25 models on 24 predictors in 0.4840281009674072 seconds.
0.420360216461
Processed  24 models on 23 predictors in 0.41344499588012695 seconds.
0.41994314702
Processed  23 models on 22 predictors in 0.4111771583557129 seconds.
0.419515064023
Processed  22 models on 21 predictors in 0.3575260639190674 seconds.
0.41903094174
Processed  21 models on 20 predictors in 0.34836697578430176 seconds.
0.418386328663
Processed  20 models on 19 predictors in 0.3024120330810547 seconds.
Backward Selction:  61.521934032440186 seconds

Backward Selection features selected



In [148]:

    
model1["Predictors"]









    Out[148]:





['accommodates',
 'bathrooms',
 'bedrooms',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'nearest_attr_dist',
 'Family/kid friendly',
 'Internet',
 'Pool',
 'Indoor fireplace',
 'Elevator in building',
 'Gym',
 'Air conditioning',
 'Cable TV',
 'neighbourhood_Ciutat Vella',
 'neighbourhood_Eixample',
 'neighbourhood_Sant Martí',
 'roomtype_Entire home/apt']

Linear Regression after Feature Selection

forward selection



In [165]:

    
y = dataframe["price"]



In [166]:

    
feature_forward = dataframe[['neighbourhood_Sant Andreu',
 'Elevator in building',
 'Internet',
 'Indoor fireplace',
 'Family/kid friendly',
 'neighbourhood_Gràcia',
 'Gym',
 'guests_included',
 'extra_people',
 'Air conditioning',
 'neighbourhood_Sants-Montjuïc',
 'Cable TV',
 'Pool',
 'nearest_attr_dist',
 'roomtype_Entire home/apt',
 'bathrooms',
 'security_deposit',
 'bedrooms',
 'cleaning_fee',
 'accommodates']]



In [167]:

    
X = feature_forward
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
lm.fit(X_train, y_train)
pred_train = lm.predict(X_train)
pred_test = lm.predict(X_test)



In [168]:

    
print("MSE:",mean_squared_error(pred_test,y_test))
r=lm.score(X_train,y_train) ## or sklearn.metrics.r2_score(y_test,pred_test)
N = X_train.shape[0]
n_predictors = len(X_train.iloc[0])
r_square_adj = 1-((1-r)*(N-1)/(N-n_predictors-1))
print("r square:", r)
print("r square adj:", r_square_adj)









    



MSE: 3836.70226625
r square: 0.40785432497
r square adj: 0.406638919233



In [169]:

    
X_train_constant = sm.add_constant(X_train)
est = sm.OLS(np.asarray(y_train), np.asarray(X_train_constant))
est2 = est.fit()
print(est2.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.408
Model:                            OLS   Adj. R-squared:                  0.407
Method:                 Least Squares   F-statistic:                     335.6
Date:                Sun, 09 Apr 2017   Prob (F-statistic):               0.00
Time:                        20:35:22   Log-Likelihood:                -55013.
No. Observations:                9765   AIC:                         1.101e+05
Df Residuals:                    9744   BIC:                         1.102e+05
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         -6.0464      2.304     -2.624      0.009       -10.563    -1.530
x1           -16.2688      5.664     -2.872      0.004       -27.371    -5.167
x2             5.3588      1.464      3.661      0.000         2.490     8.228
x3            -5.3549      1.445     -3.706      0.000        -8.187    -2.523
x4            14.1791      5.154      2.751      0.006         4.077    24.281
x5            -5.1555      1.636     -3.151      0.002        -8.362    -1.949
x6            -7.4907      2.311     -3.242      0.001       -12.020    -2.961
x7            40.8384      7.838      5.210      0.000        25.474    56.203
x8             2.6258      0.681      3.854      0.000         1.290     3.961
x9            -0.4338      0.068     -6.361      0.000        -0.567    -0.300
x10            7.3806      1.633      4.519      0.000         4.179    10.582
x11          -10.7909      2.185     -4.939      0.000       -15.074    -6.508
x12           13.3200      2.126      6.266      0.000         9.153    17.487
x13           41.1300      5.043      8.156      0.000        31.245    51.015
x14          -34.6634      4.182     -8.289      0.000       -42.861   -26.466
x15           27.2917      2.100     12.994      0.000        23.175    31.409
x16           18.0918      1.336     13.542      0.000        15.473    20.711
x17            0.0534      0.005      9.904      0.000         0.043     0.064
x18           17.5908      1.223     14.382      0.000        15.193    19.988
x19            0.2468      0.034      7.199      0.000         0.180     0.314
x20            4.2241      0.591      7.143      0.000         3.065     5.383
==============================================================================
Omnibus:                    22446.432   Durbin-Watson:                   2.064
Prob(Omnibus):                  0.000   Jarque-Bera (JB):        455031459.486
Skew:                          21.795   Prob(JB):                         0.00
Kurtosis:                    1059.625   Cond. No.                     2.29e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.29e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

backward selection



In [170]:

    
feature_backward = dataframe[['accommodates',
 'bathrooms',
 'bedrooms',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'nearest_attr_dist',
 'Family/kid friendly',
 'Internet',
 'Pool',
 'Indoor fireplace',
 'Elevator in building',
 'Gym',
 'Air conditioning',
 'Cable TV',
 'neighbourhood_Gràcia',
 'neighbourhood_Sant Andreu',
 'neighbourhood_Sants-Montjuïc',
 'roomtype_Entire home/apt']]



In [171]:

    
X = feature_backward
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
lm.fit(X_train, y_train)
pred_train = lm.predict(X_train)
pred_test = lm.predict(X_test)



In [172]:

    
print("MSE:",mean_squared_error(pred_test,y_test))
r=lm.score(X_train,y_train) ## or sklearn.metrics.r2_score(y_test,pred_test)
N = X_train.shape[0]
n_predictors = len(X_train.iloc[0])
r_square_adj = 1-((1-r)*(N-1)/(N-n_predictors-1))
print("r square:", r)
print("r square adj:", r_square_adj)









    



MSE: 3836.70226625
r square: 0.40785432497
r square adj: 0.406638919233



In [173]:

    
X_train_constant = sm.add_constant(X_train)
est = sm.OLS(np.asarray(y_train), np.asarray(X_train_constant))
est2 = est.fit()
print(est2.summary())









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.408
Model:                            OLS   Adj. R-squared:                  0.407
Method:                 Least Squares   F-statistic:                     335.6
Date:                Sun, 09 Apr 2017   Prob (F-statistic):               0.00
Time:                        20:36:15   Log-Likelihood:                -55013.
No. Observations:                9765   AIC:                         1.101e+05
Df Residuals:                    9744   BIC:                         1.102e+05
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         -6.0464      2.304     -2.624      0.009       -10.563    -1.530
x1             4.2241      0.591      7.143      0.000         3.065     5.383
x2            18.0918      1.336     13.542      0.000        15.473    20.711
x3            17.5908      1.223     14.382      0.000        15.193    19.988
x4             0.0534      0.005      9.904      0.000         0.043     0.064
x5             0.2468      0.034      7.199      0.000         0.180     0.314
x6             2.6258      0.681      3.854      0.000         1.290     3.961
x7            -0.4338      0.068     -6.361      0.000        -0.567    -0.300
x8           -34.6634      4.182     -8.289      0.000       -42.861   -26.466
x9            -5.1555      1.636     -3.151      0.002        -8.362    -1.949
x10           -5.3549      1.445     -3.706      0.000        -8.187    -2.523
x11           41.1300      5.043      8.156      0.000        31.245    51.015
x12           14.1791      5.154      2.751      0.006         4.077    24.281
x13            5.3588      1.464      3.661      0.000         2.490     8.228
x14           40.8384      7.838      5.210      0.000        25.474    56.203
x15            7.3806      1.633      4.519      0.000         4.179    10.582
x16           13.3200      2.126      6.266      0.000         9.153    17.487
x17           -7.4907      2.311     -3.242      0.001       -12.020    -2.961
x18          -16.2688      5.664     -2.872      0.004       -27.371    -5.167
x19          -10.7909      2.185     -4.939      0.000       -15.074    -6.508
x20           27.2917      2.100     12.994      0.000        23.175    31.409
==============================================================================
Omnibus:                    22446.432   Durbin-Watson:                   2.064
Prob(Omnibus):                  0.000   Jarque-Bera (JB):        455031459.486
Skew:                          21.795   Prob(JB):                         0.00
Kurtosis:                    1059.625   Cond. No.                     2.29e+03
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.29e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Classification



In [174]:

    
import seaborn as sns



In [175]:

    
df_dummies = dataframe[['accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people', 'minimum_nights', 'maximum_nights', 'availability_365',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable',
       'reviews_per_month', 'nearest_attr_dist', 'nearest_attr_rating',
       'neighbourhood_Ciutat Vella', 'neighbourhood_Eixample',
       'neighbourhood_Gràcia', 'neighbourhood_Horta-Guinardó',
       'neighbourhood_Les Corts', 'neighbourhood_Nou Barris',
       'neighbourhood_Sant Andreu', 'neighbourhood_Sant Martí',
       'neighbourhood_Sants-Montjuïc', 'neighbourhood_Sarrià-Sant Gervasi',
       'roomtype_Entire home/apt', 'roomtype_Private room',
       'roomtype_Shared room', 'cancellation_policy_flexible',
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60']]



In [176]:

    
X = df_dummies

Boxplot to see distribution of reviews per month



In [177]:

    
sns.set_style("whitegrid")



In [178]:

    
ax = sns.boxplot(x=df_dummies["reviews_per_month"])

Label data as good if there are fewer than 3 reviews a month and best otherwise



In [179]:

    
good = [x for x in df_dummies["reviews_per_month"] if x < 3]



In [180]:

    
best = [x for x in df_dummies["reviews_per_month"] if x >= 3]



In [181]:

    
df = dataframe[['require_guest_profile_picture', 'require_guest_phone_verification',
               'host_total_listings_count', 'host_has_profile_pic',
       'host_identity_verified','host_response_rate', 'host_is_superhost','accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people', 'minimum_nights', 'maximum_nights', 'availability_365',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable',
       'nearest_attr_dist', 'nearest_attr_rating',
       'neighbourhood_Ciutat Vella', 'neighbourhood_Eixample',
       'neighbourhood_Gràcia', 'neighbourhood_Horta-Guinardó',
       'neighbourhood_Les Corts', 'neighbourhood_Nou Barris',
       'neighbourhood_Sant Andreu', 'neighbourhood_Sant Martí',
       'neighbourhood_Sants-Montjuïc', 'neighbourhood_Sarrià-Sant Gervasi',
       'roomtype_Entire home/apt', 'roomtype_Private room',
       'roomtype_Shared room', 'cancellation_policy_flexible',
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60',"reviews_per_month"]]



In [182]:

    
cat = pd.cut(df['reviews_per_month'], bins=(0, 3, 10), include_lowest=True,labels=[0, 1])



In [183]:

    
df['class_reviews'] = cat









    



/Applications/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

Convert labels of good and best reviews to integer



In [184]:

    
low = df.loc[df['class_reviews'] == 0]



In [185]:

    
high = df.loc[df['class_reviews'] == 1]

Decision Tree



In [188]:

    
from sklearn import tree



In [201]:

    
from sklearn.cross_validation import train_test_split
import statsmodels.api as sm
import sklearn.metrics



In [212]:

    
clf = tree.DecisionTreeClassifier() #15

GridSearch to obtain best parameters for the Decision Tree



In [226]:

    
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold

parameter_grid = {'max_depth': [4,5,6,7,8],
                  "min_samples_leaf": [100],
                  "max_leaf_nodes": [6,7,8,9,10]}

cross_validation = StratifiedKFold(df.class_reviews, n_folds=10)

grid_search = GridSearchCV(clf,
                           param_grid=parameter_grid,
                           cv=cross_validation)

grid_search.fit(X, y)
print('Best parameters: {}'.format(grid_search.best_params_))









    



Best parameters: {'max_depth': 4, 'min_samples_leaf': 100, 'max_leaf_nodes': 7}



In [227]:

    
clf = tree.DecisionTreeClassifier(max_depth=4,min_samples_leaf=100,max_leaf_nodes=7) #15



In [228]:

    
X = df[['accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people', 'minimum_nights', 'maximum_nights',
       'instant_bookable',
       'nearest_attr_dist', 'nearest_attr_rating',
       'roomtype_Entire home/apt', 'roomtype_Private room',
       'roomtype_Shared room', 'cancellation_policy_flexible',
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60']]



In [229]:

    
feature_names=['accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people', 'minimum_nights', 'maximum_nights',
       'instant_bookable',
       'nearest_attr_dist', 'nearest_attr_rating',
       'roomtype_Entire home/apt', 'roomtype_Private room',
       'roomtype_Shared room', 'cancellation_policy_flexible',
       'cancellation_policy_moderate', 'cancellation_policy_strict',
       'cancellation_policy_super_strict_30',
       'cancellation_policy_super_strict_60']



In [230]:

    
target_names=["low_popularity","high_popularity"]



In [231]:

    
y = df[["class_reviews"]]



In [232]:

    
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)



In [233]:

    
clf = clf.fit(X_train, y_train)



In [234]:

    
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)



In [238]:

    
from IPython.display import Image



In [239]:

    
import pydotplus 
dot_data = tree.export_graphviz(clf, out_file=None) 
graph = pydotplus.graph_from_dot_data(dot_data) 
graph.write_pdf("airbnb.pdf")









    Out[239]:





True



In [240]:

    
dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=feature_names,  
                         class_names=target_names,  
                         filled=True, rounded=True,  
                         special_characters=True)



In [241]:

    
graph = pydotplus.graph_from_dot_data(dot_data)



In [242]:

    
Image(graph.create_png())









    Out[242]:



In [243]:

    
# Decision Tree accuracy
clf.score(X_test, y_test)









    Out[243]:





0.81075268817204305



In [ ]:



In [ ]:

		24-hour check-in	Air conditioning	Breakfast	Cable TV	Elevator in building	Family/kid friendly	...	review_scores_cleanliness	review_scores_communication	review_scores_location	review_scores_rating	review_scores_value	reviews_per_month	roomtype_Private room	security_deposit
0	0	1	0	1	0	1	0	...	10	10	10	95	10	2.87	1.0	100
1	1	0	1	1	1	1	0	...	10	10	10	95	9	3.12	1.0	200
2	2	0	0	0	0	1	1	...	5	8	9	65	7	1.14	1.0	0
3	3	0	0	0	0	0	0	...	10	9	9	92	9	1.42	1.0	100
4	4	0	0	0	0	1	0	...	8	9	9	88	9	1.05	1.0	0